{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "# 清除未标记诊断信息的数据"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入包\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "source": [
    "## 读入数据"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_filepath = \"../data/numpy_data/\"\n",
    "\n",
    "X_train = np.load(data_filepath + 'X_train.npy')\n",
    "y_train = np.load(data_filepath + 'y_train.npy', allow_pickle=True)\n",
    "X_test = np.load(data_filepath + 'X_test.npy')\n",
    "y_test = np.load(data_filepath + 'y_test.npy', allow_pickle=True)\n",
    "\n",
    "# reshape y_train, y_test\n",
    "y_train = y_train.reshape(len(y_train), 1)\n",
    "y_test = y_test.reshape(len(y_test), 1)"
   ]
  },
  {
   "source": [
    "### 检查数据维度是否一致\n",
    "\n",
    "**一定注意X和y的第一维度一致，且y的维度为(nums,1)**"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(16966, 500, 12)"
      ]
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(16966, 1)"
      ]
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(1901, 500, 12)"
      ]
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(1901, 1)"
      ]
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(16966, 1)"
      ]
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([[list(['NORM'])],\n",
       "       [list(['NORM'])],\n",
       "       [list(['NORM'])],\n",
       "       ...,\n",
       "       [list(['STTC'])],\n",
       "       [list(['NORM'])],\n",
       "       [list(['NORM'])]], dtype=object)"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_list = y_train.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[[['NORM']],\n",
       " [['NORM']],\n",
       " [['NORM']],\n",
       " [['NORM']],\n",
       " [['NORM']],\n",
       " [['NORM']],\n",
       " [['MI']],\n",
       " [['NORM']],\n",
       " [['NORM']],\n",
       " [['NORM']]]"
      ]
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "source": [
    "y_list[0:10]"
   ]
  },
  {
   "source": [
    "## 清除没有诊断信息的数据\n",
    "\n",
    "这个`label`中有一些数据没有诊断信息，在训练中是无法处理的，必须剔除掉，如下打印的`y_train`所示，其中有一个奇怪的数据类型`<class 'numpy.ndarray'>`，打印的结果是`[list([])]`，搞了半天实在不懂是个啥子数据类型，直接从数据集中通过`temp`取出来，然后对比原数据集处理"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取这个奇怪的空诊断信息\n",
    "temp = None\n",
    "for items in y_train:\n",
    "    if len(items[0]) is 0:\n",
    "       print(type(items)) \n",
    "       print(items)\n",
    "       temp = items\n",
    "       continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([    0,     1,     2, ..., 16963, 16964, 16965])"
      ]
     },
     "metadata": {},
     "execution_count": 12
    }
   ],
   "source": [
    "# 没有诊断信息的训练集\n",
    "np.where(y_train != temp)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([], dtype=int64)"
      ]
     },
     "metadata": {},
     "execution_count": 13
    }
   ],
   "source": [
    "# 没有诊断信息的测试集\n",
    "np.where(y_test == temp)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 重新调整数据集\n",
    "X_train = X_train[np.where(y_train != temp)[0]]\n",
    "y_train = y_train[np.where(y_train != temp)[0]]\n",
    "\n",
    "X_test = X_test[np.where(y_test != temp)[0]]\n",
    "y_test = y_test[np.where(y_test != temp)[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save_path = '../data/numpy_data/'\n",
    "# np.save(save_path+'X_train.npy', X_train)\n",
    "# np.save(save_path+'y_train.npy', np.array(y_train))\n",
    "# np.save(save_path+'X_test.npy', X_test)\n",
    "# np.save(save_path+'y_test.npy', np.array(y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "ecg = X_train[np.random.randint(len(X_train))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Using matplotlib backend: GTK3Agg\n"
     ]
    }
   ],
   "source": [
    "%matplotlib auto\n",
    "plt.figure()\n",
    "for index in range(12):\n",
    "    plt.subplot(6, 2, index+1)\n",
    "    plt.plot(ecg[:,index])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}